library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.0
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(rtweet)
##
## Attaching package: 'rtweet'
##
## The following object is masked from 'package:purrr':
##
## flatten
library(lubridate)
## Loading required package: timechange
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library("readxl")
library(ggplot2)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
library(penalized)
## Welcome to penalized. For extended examples, see vignette("penalized").
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(corrplot)
## corrplot 0.92 loaded
library(reticulate)
library(ggmap)
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggmap':
##
## wind
##
## The following object is masked from 'package:MASS':
##
## select
##
## The following object is masked from 'package:Hmisc':
##
## subplot
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library('DT')
library(future)
##
## Attaching package: 'future'
##
## The following object is masked from 'package:survival':
##
## cluster
# Pull the `test` object out of the Python session through the `py` proxy
# and list its columns.
df <- py$test
colnames(df)
## [1] "id"
## [2] "mlsListingId"
## [3] "mlsOrgId"
## [4] "agentId"
## [5] "agentName"
## [6] "agentOrganizationName"
## [7] "agentBrePrefix"
## [8] "openHouseSchedule"
## [9] "propertyType"
## [10] "picturesNum"
## [11] "tour3D"
## [12] "tour3Ds"
## [13] "openHouseFlag"
## [14] "flag"
## [15] "primaryType"
## [16] "secondaryType"
## [17] "hoaFee"
## [18] "listingSource"
## [19] "lotSize"
## [20] "chimeFirstInserted"
## [21] "hasPriceChanged"
## [22] "builtYear"
## [23] "oldPrice"
## [24] "mlsUpdateL"
## [25] "newListingFlag"
## [26] "backOnMarketFlag"
## [27] "daysOnList"
## [28] "priceChange"
## [29] "hasPriceChangedV1"
## [30] "hasStatusChange"
## [31] "mlsListDateL"
## [32] "mlsListDateLSort"
## [33] "newUpdateTime"
## [34] "openHouseSchedules"
## [35] "openHouseDesc"
## [36] "mlsUpdateTimeL"
## [37] "updateTimeV2"
## [38] "price"
## [39] "ceilingPrice"
## [40] "bedrooms"
## [41] "bathrooms"
## [42] "fullBaths"
## [43] "halfBaths"
## [44] "quarterBaths"
## [45] "threeQuarterBaths"
## [46] "sqft"
## [47] "detailsDescribe"
## [48] "spaceRent"
## [49] "locationId"
## [50] "city"
## [51] "state"
## [52] "zipCode"
## [53] "previewPicture"
## [54] "streetAddress"
## [55] "longitude"
## [56] "latitude"
## [57] "waterfrontFlag"
## [58] "specialListingCondition"
## [59] "detailUrl"
## [60] "updateTimeS"
## [61] "lastPrimaryChangeTime"
## [62] "collectStatus"
## [63] "generateTime"
## [64] "addressPartialFlag"
## [65] "mlsOrg"
## [66] "bidLevel"
## [67] "createDate"
## [68] "updateDate"
## [69] "coAgentName"
## [70] "coAgentOrgName"
## [71] "elementarySchool"
## [72] "middleSchool"
## [73] "highSchool"
## [74] "schoolDistrict"
## [75] "oldValue"
## [76] "monthlyTotalFees"
## [77] "coSellingAgentStatus"
## [78] "coAgentId"
## [79] "commercialRealEstate"
## [80] "listingPictures"
## [81] "resourceKey"
## [82] "leadCount"
## [83] "statusChangeOrSoldDate"
## [84] "mlsListingDate"
## [85] "totalAvailableAcres"
## [86] "location"
## [87] "taxAmount"
## [88] "showPreviousNext"
## [89] "previousNextSource"
## [90] "detailLink"
## [91] "listingStatus"
## [92] "coAgent"
## [93] "appDetailLink"
## [94] "listingProvided"
## [95] "address"
## [96] "flagText"
## [97] "listingStatusText"
## [98] "propertyTypeText"
## [99] "link"
## [100] "communityFeatures"
## [101] "petsDescription"
## [102] "stories"
## [103] "chimeVideoLink"
## [104] "statusUpdateTime"
## [105] "priceUpdateTime"
## [106] "elementarySchoolV1"
## [107] "middleSchoolV1"
## [108] "highSchoolV1"
## [109] "schoolDistrictV1"
## [110] "Prop_Merge_Address"
## [111] "pin"
## [112] "pin10"
## [113] "year"
## [114] "class_x"
## [115] "triad_name"
## [116] "triad_code"
## [117] "township_name"
## [118] "township_code"
## [119] "nbhd_code"
## [120] "tax_code"
## [121] "tieback_proration_rate"
## [122] "lon"
## [123] "lat"
## [124] "x_3435"
## [125] "y_3435"
## [126] "prop_address_full"
## [127] "prop_address_city_name"
## [128] "prop_address_state"
## [129] "prop_address_zipcode_1"
## [130] "mail_address_name"
## [131] "mail_address_full"
## [132] "mail_address_city_name"
## [133] "mail_address_state"
## [134] "mail_address_zipcode_1"
## [135] "census_block_group_geoid"
## [136] "census_block_geoid"
## [137] "census_congressional_district_geoid"
## [138] "census_county_subdivision_geoid"
## [139] "census_place_geoid"
## [140] "census_puma_geoid"
## [141] "census_school_district_unified_geoid"
## [142] "census_state_representative_geoid"
## [143] "census_state_senate_geoid"
## [144] "census_tract_geoid"
## [145] "census_zcta_geoid"
## [146] "census_data_year"
## [147] "census_acs5_congressional_district_geoid"
## [148] "census_acs5_county_subdivision_geoid"
## [149] "census_acs5_place_geoid"
## [150] "census_acs5_puma_geoid"
## [151] "census_acs5_school_district_unified_geoid"
## [152] "census_acs5_state_representative_geoid"
## [153] "census_acs5_state_senate_geoid"
## [154] "census_acs5_tract_geoid"
## [155] "census_acs5_data_year"
## [156] "cook_board_of_review_district_num"
## [157] "cook_board_of_review_district_data_year"
## [158] "cook_commissioner_district_num"
## [159] "cook_commissioner_district_data_year"
## [160] "cook_judicial_district_num"
## [161] "cook_judicial_district_data_year"
## [162] "cook_municipality_num"
## [163] "cook_municipality_name"
## [164] "cook_municipality_data_year"
## [165] "ward_num"
## [166] "ward_data_year"
## [167] "chicago_community_area_num"
## [168] "chicago_community_area_name"
## [169] "chicago_community_area_data_year"
## [170] "chicago_police_district_num"
## [171] "chicago_police_district_data_year"
## [172] "env_flood_fema_sfha"
## [173] "env_flood_fema_data_year"
## [174] "env_flood_fs_factor"
## [175] "env_flood_fs_risk_direction"
## [176] "env_flood_fs_data_year"
## [177] "env_ohare_noise_contour_no_buffer_bool"
## [178] "env_ohare_noise_contour_half_mile_buffer_bool"
## [179] "env_ohare_noise_contour_data_year"
## [180] "env_airport_noise_dnl"
## [181] "env_airport_noise_data_year"
## [182] "school_elementary_district_geoid"
## [183] "school_elementary_district_name"
## [184] "school_secondary_district_geoid"
## [185] "school_secondary_district_name"
## [186] "school_school_year"
## [187] "school_data_year"
## [188] "tax_community_college_district"
## [189] "tax_community_college_district_name"
## [190] "tax_community_college_district_data_year"
## [191] "tax_park_district_num"
## [192] "tax_park_district_name"
## [193] "tax_park_district_data_year"
## [194] "access_cmap_walk_id"
## [195] "access_cmap_walk_nta_score"
## [196] "access_cmap_walk_total_score"
## [197] "access_cmap_walk_data_year"
## [198] "misc_subdivision_id"
## [199] "misc_subdivision_data_year"
## [200] "num_pin_in_half_mile"
## [201] "num_bus_stop_in_half_mile"
## [202] "num_bus_stop_data_year"
## [203] "num_foreclosure_in_half_mile_past_5_years"
## [204] "num_foreclosure_per_1000_pin_past_5_years"
## [205] "num_foreclosure_data_year"
## [206] "num_school_in_half_mile"
## [207] "num_school_data_year"
## [208] "nearest_bike_trail_dist_ft"
## [209] "nearest_bike_trail_data_year"
## [210] "nearest_cemetery_gnis_code"
## [211] "nearest_cemetery_name"
## [212] "nearest_cemetery_dist_ft"
## [213] "nearest_cemetery_data_year"
## [214] "nearest_cta_route_id"
## [215] "nearest_cta_route_name"
## [216] "nearest_cta_route_dist_ft"
## [217] "nearest_cta_route_data_year"
## [218] "nearest_cta_stop_id"
## [219] "nearest_cta_stop_name"
## [220] "nearest_cta_stop_dist_ft"
## [221] "nearest_cta_stop_data_year"
## [222] "nearest_golf_course_id"
## [223] "nearest_golf_course_dist_ft"
## [224] "nearest_golf_course_data_year"
## [225] "nearest_hospital_gnis_code"
## [226] "nearest_hospital_name"
## [227] "nearest_hospital_dist_ft"
## [228] "nearest_hospital_data_year"
## [229] "lake_michigan_dist_ft"
## [230] "lake_michigan_data_year"
## [231] "nearest_major_road_osm_id"
## [232] "nearest_major_road_name"
## [233] "nearest_major_road_dist_ft"
## [234] "nearest_major_road_data_year"
## [235] "nearest_metra_route_id"
## [236] "nearest_metra_route_name"
## [237] "nearest_metra_route_dist_ft"
## [238] "nearest_metra_route_data"
## [239] "nearest_metra_stop_id"
## [240] "nearest_metra_stop_name"
## [241] "nearest_metra_stop_dist_ft"
## [242] "nearest_metra_stop_data_year"
## [243] "nearest_park_osm_id"
## [244] "nearest_park_name"
## [245] "nearest_park_dist_ft"
## [246] "nearest_park_data_year"
## [247] "nearest_railroad_id"
## [248] "nearest_railroad_name"
## [249] "nearest_railroad_dist_ft"
## [250] "nearest_railroad_data_year"
## [251] "nearest_water_id"
## [252] "nearest_water_name"
## [253] "nearest_water_dist_ft"
## [254] "nearest_water_data_year"
## [255] "nearest_neighbor_1_pin10"
## [256] "nearest_neighbor_1_dist_ft"
## [257] "nearest_neighbor_2_pin10"
## [258] "nearest_neighbor_2_dist_ft"
## [259] "nearest_neighbor_3_pin10"
## [260] "nearest_neighbor_3_dist_ft"
## [261] "nearest_bike_trail_id"
## [262] "econ_qualified_opportunity_zone_num"
## [263] "econ_qualified_opportunity_zone_data_year"
## [264] "tax_tif_district_num"
## [265] "tax_tif_district_name"
## [266] "tax_tif_district_data_year"
## [267] "tieback_key_pin"
## [268] "tax_special_service_area_num"
## [269] "tax_special_service_area_name"
## [270] "tax_special_service_area_data_year"
## [271] "nearest_bike_trail_name"
## [272] "chicago_industrial_corridor_num"
## [273] "chicago_industrial_corridor_name"
## [274] "chicago_industrial_corridor_data_year"
## [275] "census_school_district_elementary_geoid"
## [276] "census_school_district_secondary_geoid"
## [277] "census_acs5_school_district_elementary_geoid"
## [278] "census_acs5_school_district_secondary_geoid"
## [279] "econ_coordinated_care_area_num"
## [280] "econ_coordinated_care_area_data_year"
## [281] "tax_sanitation_district_num"
## [282] "tax_sanitation_district_name"
## [283] "tax_sanitation_district_data_year"
## [284] "econ_enterprise_zone_num"
## [285] "econ_enterprise_zone_data_year"
## [286] "econ_industrial_growth_zone_num"
## [287] "econ_industrial_growth_zone_data_year"
## [288] "tax_library_district_num"
## [289] "tax_library_district_name"
## [290] "tax_library_district_data_year"
## [291] "tax_fire_protection_district_num"
## [292] "tax_fire_protection_district_name"
## [293] "tax_fire_protection_district_data_year"
## [294] "pin2"
## [295] "certified"
## [296] "first_pass"
## [297] "class_y"
## [298] "tax_year"
## [299] "nbhd"
## [300] "hd_sf"
## [301] "town_code"
## [302] "type_resd"
## [303] "apts"
## [304] "ext_wall"
## [305] "roof_cnst"
## [306] "rooms"
## [307] "beds"
## [308] "bsmt"
## [309] "bsmt_fin"
## [310] "heat"
## [311] "oheat"
## [312] "air"
## [313] "frpl"
## [314] "attic_type"
## [315] "attic_fnsh"
## [316] "hbath"
## [317] "tp_plan"
## [318] "tp_dsgn"
## [319] "cnst_qlty"
## [320] "site"
## [321] "gar1_size"
## [322] "gar1_cnst"
## [323] "gar1_att"
## [324] "gar1_area"
## [325] "gar2_size"
## [326] "gar2_cnst"
## [327] "gar2_att"
## [328] "gar2_area"
## [329] "porch"
## [330] "ot_impr"
## [331] "bldg_sf"
## [332] "repair_cnd"
## [333] "multi_code"
## [334] "ncu"
## [335] "pri_est_land"
## [336] "pri_est_bldg"
## [337] "centroid_x"
## [338] "centroid_y"
## [339] "tractce"
## [340] "multi_ind"
## [341] "addr"
## [342] "modeling_group"
## [343] "fbath"
## [344] "age"
## [345] "use_1"
## [346] "o_hare_noise"
## [347] "floodplain"
## [348] "near_major_road"
## [349] "total_units"
## [350] "age_squared"
## [351] "age_decade"
## [352] "age_decade_squared"
## [353] "lot_size_squared"
## [354] "improvement_size_squared"
## [355] "location_factor"
## [356] "garage_indicator"
## [357] "pure_market_sale"
## [358] "pure_market_filter"
## [359] "neigborhood_code_mapping_"
## [360] "square_root_of_lot_size"
## [361] "square_root_of_age"
## [362] "square_root_of_improvement_size"
## [363] "town_and_neighborhood"
## [364] "most_recent_sale_date"
## [365] "doc_no"
## [366] "most_recent_sale_price"
## [367] "deed_type"
## [368] "n_units"
## [369] "per_ass"
## [370] "condo_class_factor"
## [371] "residential_share_of_building"
## [372] "condition_desirability_and_utility"
## [373] "condo_strata"
## [374] "multi_family_ind"
## [375] "renovation"
## [376] "total_bldg_sf"
# Prepare the listing data:
#   * coerce the coordinates to numeric and scale them by 100000,
#   * turn the listing id into a factor,
#   * keep residential rows with a non-zero sqft and a positive build year,
#   * add `byexp`, the squared difference between 2023 and the build year,
#   * keep one row per listing id (all columns retained).
df <- df %>%
  mutate(
    longitude = as.numeric(longitude) * 100000,
    latitude = as.numeric(latitude) * 100000,
    id = as.factor(id)
  ) %>%
  dplyr::filter(primaryType == "Residential", sqft != 0, builtYear > 0) %>%
  mutate(byexp = (2023 - builtYear)^2) %>%
  distinct(id, .keep_all = TRUE)

# Histogram of price using ggplot2's default binning.
ggplot(df, aes(x = price)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Columns whose pairwise correlations are plotted below.
df_cols <- df[c(
  "price", "builtYear", "bedrooms", "bathrooms",
  "sqft", "longitude", "latitude"
)]

# Mixed correlogram; each correlation uses the rows complete for that pair,
# and variables are arranged with corrplot's "AOE" ordering.
corrplot.mixed(
  corr = df_cols %>% cor(use = "pairwise.complete.obs"),
  order = "AOE"
)

# Baseline linear model of price, fitted by ordinary least squares.
lin_mod <- lm(
  price ~ 1 + builtYear + bedrooms + certified +
    bathrooms + sqft + longitude + latitude,
  data = df
)

# Robust fit of the same formula using the bisquare psi function.
# rlm()'s default iteration budget (maxit = 20) ran out before the fit
# converged on this data, so allow more iterations rather than summarising
# an unconverged fit.
rlin_mod <- rlm(
  price ~ 1 + builtYear + bedrooms + certified +
    bathrooms + sqft + longitude + latitude,
  data = df,
  psi = psi.bisquare,
  maxit = 200
)
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
# Print the coefficient table and fit statistics of the least-squares model.
summary(lin_mod)
##
## Call:
## lm(formula = price ~ 1 + builtYear + bedrooms + certified + bathrooms +
## sqft + longitude + latitude, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1969132 -79346 5100 71928 8812620
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.693e+06 6.754e+06 -0.251 0.802049
## builtYear 4.511e+02 1.267e+02 3.559 0.000376 ***
## bedrooms -3.068e+04 5.777e+03 -5.310 1.15e-07 ***
## certified 8.704e-01 1.150e-02 75.669 < 2e-16 ***
## bathrooms 8.415e+04 8.099e+03 10.390 < 2e-16 ***
## sqft 1.626e+02 7.201e+00 22.582 < 2e-16 ***
## longitude 5.872e-01 8.825e-01 0.665 0.505839
## latitude 1.376e+00 6.079e-01 2.264 0.023635 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 274500 on 4088 degrees of freedom
## (409 observations deleted due to missingness)
## Multiple R-squared: 0.8362, Adjusted R-squared: 0.8359
## F-statistic: 2981 on 7 and 4088 DF, p-value: < 2.2e-16
# Print the coefficient table of the robust (rlm, bisquare psi) fit so it can
# be compared with the least-squares summary.
summary(rlin_mod)
##
## Call: rlm(formula = price ~ 1 + builtYear + bedrooms + certified +
## bathrooms + sqft + longitude + latitude, data = df, psi = psi.bisquare)
## Residuals:
## Min 1Q Median 3Q Max
## -1362534 -41926 3727 51310 10115656
##
## Coefficients:
## Value Std. Error t value
## (Intercept) -1.529534e+07 1.835088e+06 -8.334900e+00
## builtYear 2.540033e+02 3.443450e+01 7.376400e+00
## bedrooms -2.739048e+03 1.569707e+03 -1.744900e+00
## certified 8.680000e-01 3.100000e-03 2.777364e+02
## bathrooms 4.577282e+04 2.200640e+03 2.079980e+01
## sqft 3.648750e+01 1.956500e+00 1.864930e+01
## longitude -1.296100e+00 2.398000e-01 -5.405700e+00
## latitude 8.159000e-01 1.652000e-01 4.940200e+00
##
## Residual standard error: 68630 on 4088 degrees of freedom
## (409 observations deleted due to missingness)
# Default diagnostic plots for the least-squares fit.
plot(lin_mod)

# Store the model's predictions next to a copy of the observed price, then
# show the estimated coefficients.
df[["pred"]] <- predict(lin_mod, newdata = df)
df[["pp"]] <- df[["price"]]
coef(lin_mod)
## (Intercept) builtYear bedrooms certified bathrooms
## -1.693325e+06 4.510923e+02 -3.067961e+04 8.703784e-01 8.415230e+04
## sqft longitude latitude
## 1.626081e+02 5.871929e-01 1.376165e+00
# Start again from the raw Python export and rebuild the filtered frame.
df <- py$test %>%
  mutate(
    id = as.factor(id),
    longitude = as.numeric(longitude) * 100000,
    latitude = as.numeric(latitude) * 100000
  ) %>%
  dplyr::filter(primaryType == "Residential", sqft != 0, builtYear > 0) %>%
  mutate(byexp = (2023 - builtYear)^2) %>%
  distinct(id, .keep_all = TRUE)

# Modelling frame: the response plus every predictor in the formula below,
# with the three added predictors coerced to double.
df_cols <- df %>%
  dplyr::select(
    price, builtYear, bedrooms, bathrooms, sqft, longitude, latitude,
    certified, o_hare_noise, pri_est_land, pri_est_bldg
  ) %>%
  mutate(across(c(o_hare_noise, pri_est_bldg, pri_est_land), as.double))

# Robust regression (rlm defaults) on the extended predictor set; this
# replaces the earlier `lin_mod`.
lin_mod <- rlm(
  formula = price ~ 1 + builtYear + bedrooms + certified +
    bathrooms + sqft + longitude +
    latitude + o_hare_noise + pri_est_land + pri_est_bldg,
  data = df_cols
)
summary(lin_mod)
##
## Call: rlm(formula = price ~ 1 + builtYear + bedrooms + certified +
## bathrooms + sqft + longitude + latitude + o_hare_noise +
## pri_est_land + pri_est_bldg, data = df_cols)
## Residuals:
## Min 1Q Median 3Q Max
## -1596299 -48350 1430 46690 9716951
##
## Coefficients:
## Value Std. Error t value
## (Intercept) -1.099033e+07 2.139145e+06 -5.137700e+00
## builtYear 2.405557e+02 3.893600e+01 6.178200e+00
## bedrooms -1.098814e+04 1.739843e+03 -6.315600e+00
## certified 1.018800e+00 6.600000e-03 1.553726e+02
## bathrooms 4.667789e+04 2.458262e+03 1.898820e+01
## sqft 5.957970e+01 2.261000e+00 2.635160e+01
## longitude -8.474000e-01 2.764000e-01 -3.065900e+00
## latitude 7.254000e-01 1.834000e-01 3.955600e+00
## o_hare_noise -2.637882e+04 1.256847e+04 -2.098800e+00
## pri_est_land 3.894000e-01 3.080000e-02 1.263440e+01
## pri_est_bldg -1.213000e-01 7.500000e-03 -1.609760e+01
##
## Residual standard error: 70320 on 4052 degrees of freedom
## (442 observations deleted due to missingness)
# Score the modelling frame with the robust fit and record the gap between
# the observed price and the prediction (positive = priced above the model).
df_cols[["pred"]] <- predict(lin_mod, newdata = df_cols)
df_cols[["pp"]] <- df[["price"]]
df_cols[["PriceOverMarket"]] <- df_cols[["price"]] - df_cols[["pred"]]

# Apply the same double coercion to the three added predictors on the full
# frame.
df <- df %>%
  mutate(across(c(o_hare_noise, pri_est_bldg, pri_est_land), as.double))
#' Refit the robust price model on one bootstrap resample.
#'
#' @param df Data frame holding `price` and every predictor named in the
#'   model formula below.
#' @param maxit Maximum number of iterations handed to `rlm()`. The default
#'   is raised from `rlm()`'s own 20 because resampled fits repeatedly stopped
#'   with "'rlm' failed to converge in 20 steps".
#' @return The `broom::tidy()` table for the refitted model.
bootstrapping <- function(df, maxit = 100) {
  # Draw nrow(df) row indices with replacement. sample.int() avoids building
  # 1:nrow(df), which counts backwards when there are no rows.
  resample_idx <- sample.int(nrow(df), replace = TRUE)
  resampled <- df[resample_idx, , drop = FALSE]

  bs_mod <- rlm(
    price ~ 1 + builtYear + bedrooms + certified +
      bathrooms + sqft + longitude +
      latitude + o_hare_noise + pri_est_land + pri_est_bldg,
    data = resampled,
    maxit = maxit
  )

  broom::tidy(bs_mod)
}
# Run 1000 bootstrap refits inside a future. The resampling draws random
# numbers, so `seed = TRUE` is required: it gives the future a parallel-safe
# L'Ecuyer-CMRG stream instead of the "UNRELIABLE VALUE" RNG warning.
# NOTE(review): call set.seed() beforehand if the replicates must be
# reproducible from run to run.
bs_test <- future(
  replicate(1000, bootstrapping(df), simplify = FALSE),
  seed = TRUE
)

# Wait for the future to resolve, then stack the per-replicate coefficient
# tables row-wise.
bsCombined <- do.call("rbind", value(bs_test))
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
## Warning: UNRELIABLE VALUE: Future ('<none>') unexpectedly generated random
## numbers without specifying argument 'seed'. There is a risk that those random
## numbers are not statistically sound and the overall results might be invalid.
## To fix this, specify 'seed=TRUE'. This ensures that proper, parallel-safe random
## numbers are produced via the L'Ecuyer-CMRG method. To disable this check, use
## 'seed=NULL', or set option 'future.rng.onMisuse' to "ignore".
# Histogram of the `statistic` column for the rows of bsCombined whose term
# is "bathrooms". The argument expression is left as-is because hist()
# derives its default title and x-axis label from it.
hist(bsCombined$statistic[bsCombined$term == "bathrooms"],
col = "black")

# Rebuild the filtered frame from the raw Python export once more, this time
# keeping every column and coercing the three added predictors in place.
df <- py$test
df <- mutate(
  df,
  id = as.factor(id),
  longitude = as.numeric(longitude) * 100000,
  latitude = as.numeric(latitude) * 100000
)
df <- dplyr::filter(df, primaryType == "Residential", sqft != 0, builtYear > 0)
df <- mutate(df, byexp = (2023 - builtYear)^2)
df <- distinct(df, id, .keep_all = TRUE)
df <- mutate(df, across(c(o_hare_noise, pri_est_bldg, pri_est_land), as.double))

# Robust regression (rlm defaults) fitted directly on the full frame.
lin_mod <- rlm(
  formula = price ~ 1 + builtYear + bedrooms + certified +
    bathrooms + sqft + longitude +
    latitude + o_hare_noise + pri_est_land + pri_est_bldg,
  data = df
)
summary(lin_mod)
##
## Call: rlm(formula = price ~ 1 + builtYear + bedrooms + certified +
## bathrooms + sqft + longitude + latitude + o_hare_noise +
## pri_est_land + pri_est_bldg, data = df)
## Residuals:
## Min 1Q Median 3Q Max
## -1596299 -48350 1430 46690 9716951
##
## Coefficients:
## Value Std. Error t value
## (Intercept) -1.099033e+07 2.139145e+06 -5.137700e+00
## builtYear 2.405557e+02 3.893600e+01 6.178200e+00
## bedrooms -1.098814e+04 1.739843e+03 -6.315600e+00
## certified 1.018800e+00 6.600000e-03 1.553726e+02
## bathrooms 4.667789e+04 2.458262e+03 1.898820e+01
## sqft 5.957970e+01 2.261000e+00 2.635160e+01
## longitude -8.474000e-01 2.764000e-01 -3.065900e+00
## latitude 7.254000e-01 1.834000e-01 3.955600e+00
## o_hare_noise -2.637882e+04 1.256847e+04 -2.098800e+00
## pri_est_land 3.894000e-01 3.080000e-02 1.263440e+01
## pri_est_bldg -1.213000e-01 7.500000e-03 -1.609760e+01
##
## Residual standard error: 70320 on 4052 degrees of freedom
## (442 observations deleted due to missingness)
# Store the robust model's predictions, a copy of the observed price, and the
# gap between price and prediction on the full frame.
df[["pred"]] <- predict(lin_mod, newdata = df)
df[["pp"]] <- df[["price"]]
df[["PriceOverMarket"]] <- with(df, price - pred)

# Default diagnostic plots for the robust fit.
plot(lin_mod)



## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
